knitr::opts_chunk$set(
  echo = TRUE,
  include = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 12,
  fig.asp = .6,
  fig.align = "center",
  out.width = "90%"
)

library(tidyverse)
library(dplyr)
library(arsenal)
library(HH)
library(leaps)
library(corrplot) 
library(faraway)
library(broom)
library(ggplot2)
library(MASS)
library(patchwork)

Load Data Set

hate_crime = read_csv("data/HateCrimes.csv", col_types = "fffdddddd") %>% 
  janitor::clean_names() %>% 
  drop_na()

Data Exploration:

Descriptive statistics: Table 1

Descriptive Statistics: Hate Crime Rate After 2016 Election
Overall (N=45)
Level of unemployment
- high 23 (51.1%)
- low 22 (48.9%)
Level of state urbanization
- low 21 (46.7%)
- high 24 (53.3%)
Median Household Income
- Mean (SD) 55299.49 (8979.49)
- Median (Q1, Q3) 54916.00 (48060.00, 60708.00)
- Min - Max 39552.00 - 76165.00
Percent of adults with a high school degree
- Mean (SD) 0.87 (0.03)
- Median (Q1, Q3) 0.87 (0.84, 0.89)
- Min - Max 0.80 - 0.92
Percent of population that are not US citizens
- Mean (SD) 0.06 (0.03)
- Median (Q1, Q3) 0.05 (0.03, 0.08)
- Min - Max 0.01 - 0.13
Income inequality index
- Mean (SD) 0.46 (0.02)
- Median (Q1, Q3) 0.46 (0.44, 0.47)
- Min - Max 0.42 - 0.53
Percent of population that are non-white
- Mean (SD) 0.32 (0.15)
- Median (Q1, Q3) 0.30 (0.21, 0.42)
- Min - Max 0.06 - 0.63
Hate crime rate per 100,000 population
- Mean (SD) 0.30 (0.25)
- Median (Q1, Q3) 0.23 (0.14, 0.35)
- Min - Max 0.07 - 1.52

Visualization

Histogram of Outcome Variable

From the histogram below, we observe our outcome distribution has right skewness, suggesting that we may need to check our normality assumption. Our QQ Plot also indicates severe departures from normality.

#Histogram of Outcome Distribution
hate_crime %>% 
  ggplot(aes(x = hate_crimes_per_100k_splc)) + 
  geom_histogram(color = "red", fill = "black") + 
  labs(
    title = "Distribution of Hate Crime Rates in 50 US States",
    x = "Hate Crime Rate per 100,000 Population",
    y = "Frequency of Distribution",
    caption = "Distribution of Hate Crime Rates ( 50 US States)")

QQ Plot of Outcome Variable
#QQplot of Outcome Distribution
hate_crimes_per_100k_splc = hate_crime$hate_crimes_per_100k_splc
qqnorm(hate_crimes_per_100k_splc, col = 2, pch = 19, cex = 1.5)
qq_plot = qqline(hate_crimes_per_100k_splc, col = 1,lwd = 2,lty = 2)

Shapiro-Wilk Test of Outcome Variable

After performing a Shapiro-Wilk test to check the normality assumption of our outcome distribution, we find evidence to suggest that our data deviates from normality.

# Perform Shapiro-Wilk test
shapiro.test(hate_crimes_per_100k_splc) %>% 
  broom::tidy() %>% 
  knitr::kable("simple")
statistic p.value method
0.7107896 0 Shapiro-Wilk normality test
Comparison of Basic Transformations

We apply a square root transformation and a natural log transformation to our outcome distribution, and compare the results of the data.

sqrt_transformation = hate_crime %>% 
  ggplot(aes(x = sqrt(hate_crimes_per_100k_splc))) + 
  geom_histogram(color = "red", fill = "black") + 
  labs(
    title = "Distribution of sqrt(Hate Crime Rates) in 50 US States",
    x = "sqrt(Hate Crime Rate per 100,000 Population)",
    y = "Frequency of Distribution",
    caption = "Distribution of Hate Crime Rates ( 50 US States)")

sqrt_qqplot = ggplot(hate_crime, aes(sample = sqrt(hate_crimes_per_100k_splc))) + stat_qq() + stat_qq_line() + 
  labs(
    title = "QQ Plot of sqrt(Hate Crime Rates) in 50 US States",
    x = "sqrt(Hate Crime Rate per 100,000 Population)",
    y = "Frequency of Distribution",
    caption = "Distribution of Hate Crime Rates ( 50 US States)")

ln_transformation = hate_crime %>% 
  ggplot(aes(x = log(hate_crimes_per_100k_splc))) + 
  geom_histogram(color = "red", fill = "black") + 
  labs(
    title = "Distribution of ln(Hate Crime Rates) in 50 US States",
    x = "ln(Hate Crime Rate per 100,000 Population)",
    y = "Frequency of Distribution",
    caption = "Distribution of Hate Crime Rates ( 50 US States)")

ln_qqplot = ggplot(hate_crime, aes(sample = log(hate_crimes_per_100k_splc))) + stat_qq() + stat_qq_line() + 
  labs(
    title = "QQ Plot of ln(Hate Crime Rates) in 50 US States",
    x = "ln(Hate Crime Rate per 100,000 Population)",
    y = "Frequency of Distribution",
    caption = "Distribution of Hate Crime Rates ( 50 US States)")
Sqrt vs. Ln Transformations

After visual inspection, we observe that our natural log transformation may be a good candidate to re-test our normality assumptions.

(sqrt_transformation + ln_transformation) / ( sqrt_qqplot + ln_qqplot)

Shapiro-Wilk Test on our Natural Log Transformation

From the results of our test, we observe that we fail to reject the null (our p-value > 0.05) and can state with 95% confidence that our natural log transformation does not significantly deviate from normality, so we can assume normality henceforth.

shapiro.test(log(hate_crimes_per_100k_splc)) %>% 
  broom::tidy() %>% 
  knitr::kable("simple", caption = "Shapiro Wilk Test")
Shapiro Wilk Test
statistic p.value method
0.9830847 0.7452961 Shapiro-Wilk normality test

Identify states with unusual rates and consider them as potential outliers/influential points. (Jyoti)

Identifying states with unusual rates

hate_crime %>%
ggplot(aes(x = hate_crimes_per_100k_splc, y = state, colors = state)) + geom_col(color = "blue") 

Upon Plotting a column graph of the hate crimes against their respective states, we can see that Wyoming, South Dakota, and North Dakota had no values and District of Columbia, Washington, Oregon, Minnesota, Massachussets and Maine showed relatively large columns.

After Plotting a scatter plot of the same values, it was evident that these states were outliers that influenced the data set.

hate_crime %>%
  ggplot(aes(y = hate_crimes_per_100k_splc, x = state, colors = state)) + geom_point(aes(color = state)) +
  geom_smooth(method = "lm", se=F, color = "red")

Income inequality was the main predictor of hate crime rates

Verify if this association holds true, as well as explore associations of all the other covariates mentioned above and draw your own conclusions.(Linh)

hate_crime %>% 
  drop_na() %>% 
  dplyr::select(-state,-unemployment,-urbanization) %>% 
  cor() %>% 
  knitr::kable(digits = 2)
median_household_income perc_population_with_high_school_degree perc_non_citizen gini_index perc_non_white hate_crimes_per_100k_splc
median_household_income 1.00 0.65 0.30 -0.13 0.04 0.34
perc_population_with_high_school_degree 0.65 1.00 -0.26 -0.54 -0.50 0.26
perc_non_citizen 0.30 -0.26 1.00 0.48 0.75 0.24
gini_index -0.13 -0.54 0.48 1.00 0.55 0.38
perc_non_white 0.04 -0.50 0.75 0.55 1.00 0.11
hate_crimes_per_100k_splc 0.34 0.26 0.24 0.38 0.11 1.00
hate_crime %>% 
  drop_na() %>% 
  dplyr::select(-state,-unemployment,-urbanization) %>%  #removing factor variables
  cor() %>% 
  corrplot::corrplot(method = "circle", type = "upper", diag = FALSE)

Multicollinearity

(Nikhita)

# Scatter plot showing associations between numeric variables

hate_crime %>%
  dplyr::select(-state,-unemployment,-urbanization) %>%
  pairs()

Calculating VIF for all the predictors

hate_crime_tidy = hate_crime %>% drop_na()

# fitting MLR model on tidy data without state variable
mult_fit <- lm(hate_crimes_per_100k_splc ~ unemployment + urbanization + median_household_income + perc_population_with_high_school_degree + perc_non_citizen + gini_index + perc_non_white, data = hate_crime_tidy)

vif(mult_fit) %>% knitr::kable()
x
unemploymentlow 1.426492
urbanizationhigh 1.983246
median_household_income 3.108161
perc_population_with_high_school_degree 3.895361
perc_non_citizen 3.728286
gini_index 1.845436
perc_non_white 3.236419

All the predictors have a VIF below 5. This suggests that it would not be problematic to include them in the construction of the model. However, the correlation analysis shows that variables perc_non_white and perc_non_citizen have a moderate linear relationship with a correlation coefficient of 0.75.

Interaction between income equality and unemployment
Without transformation
#Scatter plot - Hate_crime_per_100k_splc vs. gini index by unemploymnet 
ggplot(hate_crimedf, aes(x = gini_index, y = hate_crimes_per_100k_splc, colour = factor(unemployment))) +         
  geom_point(size = 2) +                                                                     
  geom_smooth(method = "lm", se = F,                                          
              aes(group = factor(unemployment),                                  
                  color = factor(unemployment))) +                                        
  labs(title = "Scatterplot of Hate crime per 100k people vs. income equality by unemploymnet status", 
       x = "gini index", y = "hate crime per 100k people") +
  scale_color_manual(name = "Unemployment", labels = c("Low", "High"), values = c("blue", "red"))    

reg1 <- lm(hate_crimes_per_100k_splc ~ gini_index*factor(unemployment), data = hate_crimedf)
summary(reg1)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ gini_index * factor(unemployment), 
##     data = hate_crimedf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28142 -0.14375 -0.04337  0.06089  0.66008 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                       -0.7256     1.1896  -0.610   0.5453  
## gini_index                         2.3090     2.6588   0.868   0.3902  
## factor(unemployment)1             -2.8730     1.6475  -1.744   0.0887 .
## gini_index:factor(unemployment)1   6.0906     3.6185   1.683   0.0999 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2282 on 41 degrees of freedom
## Multiple R-squared:  0.2334, Adjusted R-squared:  0.1773 
## F-statistic:  4.16 on 3 and 41 DF,  p-value: 0.0116

There is no significant interaction at 5% significance level. The relationship between hate crime per 100k people and income equality does not vary by unemployment status.

With natural log transformation
#Scatter plot - Hate_crime_per_100k_splc vs. gini index by unemploymnet 
ggplot(hate_crimedft, aes(x =gini_index, y = hate_crimes_per_100k_splc, colour = factor(unemployment))) +         
  geom_point(size = 2) +                                                                     
  geom_smooth(method = "lm", se = F,                                          
              aes(group = factor(unemployment),                                  
                  color = factor(unemployment))) +                                        
  labs(title = "Scatterplot of Hate crime per 100k people vs. income equality by unemploymnet status", 
       x = "ln(gini index)", y = "ln(hate crime per 100k people)") +
  scale_color_manual(name = "Unemployment", labels = c("Low", "High"), values = c("blue", "red"))    

reg1t <- lm(hate_crimes_per_100k_splc ~ gini_index*factor(unemployment), data = hate_crimedft)
summary(reg1t)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ gini_index * factor(unemployment), 
##     data = hate_crimedft)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55906 -0.57014 -0.00588  0.43404  2.18668 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        2.9840     3.8895   0.767    0.447
## gini_index                         4.1938     3.3411   1.255    0.217
## factor(unemployment)1              0.7637     5.2861   0.144    0.886
## gini_index:factor(unemployment)1   1.1717     4.6422   0.252    0.802
## 
## Residual standard error: 0.9096 on 41 degrees of freedom
## Multiple R-squared:  0.1214, Adjusted R-squared:  0.05711 
## F-statistic: 1.888 on 3 and 41 DF,  p-value: 0.1466

There is no significant interaction at 5% significance level. The relationship between hate crime per 100k people and income equality does not vary by unemployment status.

Interaction between income equality and urbanization
Without natural log transformation
#Scatter plot - Hate_crime_per_100k_splc vs. gini index by urbanization 
ggplot(hate_crimedf, aes(x = gini_index, y = hate_crimes_per_100k_splc, colour = factor(urbanization))) +         
  geom_point(size = 2) +                                                                     
  geom_smooth(method = "lm", se = F,                                          
              aes(group = factor(urbanization),                                  
                  color = factor(urbanization))) +                                        
  labs(title = "Scatterplot of Hate crime per 100k people vs. income equality by urbanization status", 
       x = "gini index", y = "hate crime per 100k people") +
  scale_color_manual(name = "Urbanization", labels = c("Low", "High"), values = c("blue", "red"))    

reg2 <- lm(hate_crimes_per_100k_splc ~ gini_index*factor(urbanization), data = hate_crimedf)
summary(reg2)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ gini_index * factor(urbanization), 
##     data = hate_crimedf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31410 -0.14242 -0.04718  0.08940  0.72556 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                        0.8628     1.5325   0.563   0.5765  
## gini_index                        -1.3825     3.4328  -0.403   0.6892  
## factor(urbanization)1             -3.5465     1.8361  -1.932   0.0603 .
## gini_index:factor(urbanization)1   7.9247     4.0649   1.950   0.0581 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2302 on 41 degrees of freedom
## Multiple R-squared:   0.22,  Adjusted R-squared:  0.1629 
## F-statistic: 3.855 on 3 and 41 DF,  p-value: 0.01612

There is no significant interaction at 5% significance level.The relationship between hate crime per 100k people and income equality does not vary by urbanization status.

With natural log transformation
#Scatter plot - Hate_crime_per_100k_splc vs. gini index by urbanization 
ggplot(hate_crimedft, aes(x = gini_index, y = hate_crimes_per_100k_splc, colour = factor(urbanization))) +         
  geom_point(size = 2) +                                                                     
  geom_smooth(method = "lm", se = F,                                          
              aes(group = factor(urbanization),                                  
                  color = factor(urbanization))) +                                        
  labs(title = "Scatterplot of Hate crime per 100k people vs. income equality by urbanization status", 
       x = "ln(gini index)", y = "ln(hate crime per 100k people)") +
  scale_color_manual(name = "Urbanization", labels = c("Low", "High"), values = c("blue", "red"))    

reg2t <- lm(hate_crimes_per_100k_splc ~ gini_index*factor(urbanization), data = hate_crimedft)
summary(reg2t)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ gini_index * factor(urbanization), 
##     data = hate_crimedft)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78892 -0.61547 -0.05282  0.61847  1.81472 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        -4.355      4.997  -0.872    0.388
## gini_index                         -1.821      4.285  -0.425    0.673
## factor(urbanization)1               6.974      5.934   1.175    0.247
## gini_index:factor(urbanization)1    5.888      5.163   1.140    0.261
## 
## Residual standard error: 0.9296 on 41 degrees of freedom
## Multiple R-squared:  0.08241,    Adjusted R-squared:  0.01527 
## F-statistic: 1.227 on 3 and 41 DF,  p-value: 0.312

There is no significant interaction at 5% significance level.The relationship between hate crime per 100k people and income equality does not vary by urbanization status.

Model diagnostics

Check model assumptions and goodness of fit (Vihar Desu)

hate_crime = 
  hate_crime %>% 
  drop_na() %>% 
  dplyr::select(-state)

hate_crime_fit = lm(hate_crimes_per_100k_splc ~., data = hate_crime)
summary(hate_crime_fit)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ ., data = hate_crime)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36552 -0.10314 -0.01316  0.09731  0.51389 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -8.263e+00  1.897e+00  -4.356 0.000101
## unemploymentlow                          1.307e-02  7.173e-02   0.182 0.856425
## urbanizationhigh                        -3.309e-02  8.475e-02  -0.390 0.698475
## median_household_income                 -1.504e-06  5.961e-06  -0.252 0.802193
## perc_population_with_high_school_degree  5.382e+00  1.835e+00   2.933 0.005735
## perc_non_citizen                         1.233e+00  1.877e+00   0.657 0.515332
## gini_index                               8.624e+00  1.973e+00   4.370 9.67e-05
## perc_non_white                          -5.842e-03  3.673e-01  -0.016 0.987396
##                                            
## (Intercept)                             ***
## unemploymentlow                            
## urbanizationhigh                           
## median_household_income                    
## perc_population_with_high_school_degree ** 
## perc_non_citizen                           
## gini_index                              ***
## perc_non_white                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2014 on 37 degrees of freedom
## Multiple R-squared:  0.461,  Adjusted R-squared:  0.3591 
## F-statistic: 4.521 on 7 and 37 DF,  p-value: 0.001007

Box-Cox without transformation

hate_crime_fit %>% 
  MASS::boxcox()

Take log transformation

hate_crime_log_fit = lm(log(hate_crimes_per_100k_splc) ~ . ,
                    data = hate_crime)

QQ plot for model with transformation and without transformation

par(mfrow = c(1, 2))

# without transformation
qqnorm(resid(hate_crime_fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(hate_crime_fit))
title("QQ Plot for Hate crime rate")

# with transformation
qqnorm(resid(hate_crime_log_fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(hate_crime_log_fit))
title("QQ Plot for Ln(Hate crime rate)")

Log transformation is better.

Model Selection

Stepwise selection

step(hate_crime_fit, direction = "backward")
## Start:  AIC=-137.03
## hate_crimes_per_100k_splc ~ unemployment + urbanization + median_household_income + 
##     perc_population_with_high_school_degree + perc_non_citizen + 
##     gini_index + perc_non_white
## 
##                                           Df Sum of Sq    RSS     AIC
## - perc_non_white                           1   0.00001 1.5008 -139.03
## - unemployment                             1   0.00135 1.5021 -138.99
## - median_household_income                  1   0.00258 1.5034 -138.95
## - urbanization                             1   0.00618 1.5070 -138.85
## - perc_non_citizen                         1   0.01750 1.5183 -138.51
## <none>                                                 1.5008 -137.03
## - perc_population_with_high_school_degree  1   0.34889 1.8497 -129.62
## - gini_index                               1   0.77465 2.2754 -120.30
## 
## Step:  AIC=-139.03
## hate_crimes_per_100k_splc ~ unemployment + urbanization + median_household_income + 
##     perc_population_with_high_school_degree + perc_non_citizen + 
##     gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## - unemployment                             1   0.00148 1.5023 -140.99
## - median_household_income                  1   0.00269 1.5035 -140.95
## - urbanization                             1   0.00617 1.5070 -140.85
## - perc_non_citizen                         1   0.02422 1.5250 -140.31
## <none>                                                 1.5008 -139.03
## - perc_population_with_high_school_degree  1   0.38759 1.8884 -130.69
## - gini_index                               1   0.77888 2.2797 -122.22
## 
## Step:  AIC=-140.99
## hate_crimes_per_100k_splc ~ urbanization + median_household_income + 
##     perc_population_with_high_school_degree + perc_non_citizen + 
##     gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## - median_household_income                  1   0.00243 1.5047 -142.91
## - urbanization                             1   0.00693 1.5092 -142.78
## - perc_non_citizen                         1   0.02401 1.5263 -142.27
## <none>                                                 1.5023 -140.99
## - perc_population_with_high_school_degree  1   0.40517 1.9074 -132.24
## - gini_index                               1   0.78876 2.2910 -124.00
## 
## Step:  AIC=-142.91
## hate_crimes_per_100k_splc ~ urbanization + perc_population_with_high_school_degree + 
##     perc_non_citizen + gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## - urbanization                             1   0.00762 1.5123 -144.69
## - perc_non_citizen                         1   0.02232 1.5270 -144.25
## <none>                                                 1.5047 -142.91
## - gini_index                               1   0.78737 2.2921 -125.97
## - perc_population_with_high_school_degree  1   0.86254 2.3672 -124.52
## 
## Step:  AIC=-144.69
## hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree + 
##     perc_non_citizen + gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## - perc_non_citizen                         1   0.01471 1.5270 -146.25
## <none>                                                 1.5123 -144.69
## - gini_index                               1   0.78804 2.3004 -127.81
## - perc_population_with_high_school_degree  1   0.85561 2.3679 -126.51
## 
## Step:  AIC=-146.25
## hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree + 
##     gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## <none>                                                 1.5270 -146.25
## - perc_population_with_high_school_degree  1   0.85432 2.3813 -128.25
## - gini_index                               1   1.06513 2.5922 -124.44
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree + 
##     gini_index, data = hate_crime)
## 
## Coefficients:
##                             (Intercept)  
##                                  -8.103  
## perc_population_with_high_school_degree  
##                                   5.059  
##                              gini_index  
##                                   8.825
#lm(hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree + gini_index, data = hate_crime)
stepwise_log_fit = 
  lm(log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + gini_index, data = hate_crime)
par(mfrow = c(2, 2))
plot(stepwise_log_fit)